|
1 | 1 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
|
2 | 2 | // vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
| 3 | + |
3 | 4 | #include "common.h"
|
| 5 | +#include "json.h" |
| 6 | +#include "json-schema-to-grammar.h" |
4 | 7 | #include "llama.h"
|
5 |
| -#include "ggml-cuda.h" |
6 |
| -#include "ggml-metal.h" |
7 | 8 |
|
8 | 9 | #include <algorithm>
|
9 | 10 | #include <cassert>
|
10 | 11 | #include <cmath>
|
11 |
| -#include <cerrno> |
12 | 12 | #include <cstring>
|
13 |
| -#include <climits> |
14 | 13 | #include <ctime>
|
15 | 14 | #include <fstream>
|
16 | 15 | #include <iterator>
|
|
74 | 73 | #define LLAMA_CURL_MAX_HEADER_LENGTH 256
|
75 | 74 | #endif // LLAMA_USE_CURL
|
76 | 75 |
|
| 76 | +using json = nlohmann::ordered_json; |
| 77 | + |
77 | 78 | int32_t get_num_physical_cores() {
|
78 | 79 | #ifdef __linux__
|
79 | 80 | // enumerate the set of thread siblings, num entries is num cores
|
@@ -110,6 +111,79 @@ int32_t get_num_physical_cores() {
|
110 | 111 | return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
111 | 112 | }
|
112 | 113 |
|
| 114 | +#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) |
| 115 | +#include <pthread.h> |
| 116 | + |
/**
 * Executes the x86 CPUID instruction for the given leaf/subleaf and
 * stores the four result registers through the output pointers.
 *
 * rbx is moved aside to rsi around the instruction instead of being
 * clobbered directly: rbx may be reserved by the compiler (e.g. as the
 * PIC register), so the asm preserves it and returns CPUID's rbx value
 * through the "=S" (rsi) output instead.
 */
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__("movq\t%%rbx,%%rsi\n\t"
            "cpuid\n\t"
            "xchgq\t%%rbx,%%rsi"
            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
            : "0"(leaf), "2"(subleaf));
}
| 125 | + |
/**
 * Restricts the calling thread to a single logical CPU.
 *
 * @param cpu index of the logical CPU to pin to
 * @return 0 on success, otherwise the pthread error code
 */
static int pin_cpu(int cpu) {
    cpu_set_t one_cpu;
    CPU_ZERO(&one_cpu);
    CPU_SET(cpu, &one_cpu);
    return pthread_setaffinity_np(pthread_self(), sizeof(one_cpu), &one_cpu);
}
| 132 | + |
| 133 | +static bool is_hybrid_cpu(void) { |
| 134 | + unsigned eax, ebx, ecx, edx; |
| 135 | + cpuid(7, 0, &eax, &ebx, &ecx, &edx); |
| 136 | + return !!(edx & (1u << 15)); |
| 137 | +} |
| 138 | + |
| 139 | +static bool is_running_on_efficiency_core(void) { |
| 140 | + unsigned eax, ebx, ecx, edx; |
| 141 | + cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); |
| 142 | + int intel_atom = 0x20; |
| 143 | + int core_type = (eax & 0xff000000u) >> 24; |
| 144 | + return core_type == intel_atom; |
| 145 | +} |
| 146 | + |
| 147 | +static int count_math_cpus(int cpu_count) { |
| 148 | + int result = 0; |
| 149 | + for (int cpu = 0; cpu < cpu_count; ++cpu) { |
| 150 | + if (pin_cpu(cpu)) { |
| 151 | + return -1; |
| 152 | + } |
| 153 | + if (is_running_on_efficiency_core()) { |
| 154 | + continue; // efficiency cores harm lockstep threading |
| 155 | + } |
| 156 | + ++cpu; // hyperthreading isn't useful for linear algebra |
| 157 | + ++result; |
| 158 | + } |
| 159 | + return result; |
| 160 | +} |
| 161 | + |
| 162 | +#endif // __x86_64__ && __linux__ && !__ANDROID__ |
| 163 | + |
| 164 | +/** |
| 165 | + * Returns number of CPUs on system that are useful for math. |
| 166 | + */ |
| 167 | +int get_math_cpu_count() { |
| 168 | +#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) |
| 169 | + int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); |
| 170 | + if (cpu_count < 1) { |
| 171 | + return get_num_physical_cores(); |
| 172 | + } |
| 173 | + if (is_hybrid_cpu()) { |
| 174 | + cpu_set_t affinity; |
| 175 | + if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) { |
| 176 | + int result = count_math_cpus(cpu_count); |
| 177 | + pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); |
| 178 | + if (result > 0) { |
| 179 | + return result; |
| 180 | + } |
| 181 | + } |
| 182 | + } |
| 183 | +#endif |
| 184 | + return get_num_physical_cores(); |
| 185 | +} |
| 186 | + |
113 | 187 | void process_escapes(std::string & input) {
|
114 | 188 | std::size_t input_len = input.length();
|
115 | 189 | std::size_t output_idx = 0;
|
@@ -1167,6 +1241,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
1167 | 1241 | );
|
1168 | 1242 | return true;
|
1169 | 1243 | }
|
| 1244 | + if (arg == "-j" || arg == "--json-schema") { |
| 1245 | + if (++i >= argc) { |
| 1246 | + invalid_param = true; |
| 1247 | + return true; |
| 1248 | + } |
| 1249 | + sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); |
| 1250 | + return true; |
| 1251 | + } |
1170 | 1252 | if (arg == "--override-kv") {
|
1171 | 1253 | if (++i >= argc) {
|
1172 | 1254 | invalid_param = true;
|
@@ -1374,6 +1456,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
1374 | 1456 | printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
|
1375 | 1457 | printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
|
1376 | 1458 | printf(" --grammar-file FNAME file to read grammar from\n");
|
| 1459 | + printf(" -j SCHEMA, --json-schema SCHEMA\n"); |
| 1460 | + printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n"); |
| 1461 | + printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n"); |
1377 | 1462 | printf(" --cfg-negative-prompt PROMPT\n");
|
1378 | 1463 | printf(" negative prompt to use for guidance. (default: empty)\n");
|
1379 | 1464 | printf(" --cfg-negative-prompt-file FNAME\n");
|
@@ -1766,6 +1851,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
1766 | 1851 | cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
1767 | 1852 | cparams.pooling_type = params.pooling_type;
|
1768 | 1853 | cparams.defrag_thold = params.defrag_thold;
|
| 1854 | + cparams.cb_eval = params.cb_eval; |
| 1855 | + cparams.cb_eval_user_data = params.cb_eval_user_data; |
1769 | 1856 | cparams.offload_kqv = !params.no_kv_offload;
|
1770 | 1857 |
|
1771 | 1858 | cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
@@ -2213,7 +2300,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
2213 | 2300 | params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
2214 | 2301 | }
|
2215 | 2302 |
|
2216 |
| - { |
| 2303 | + if (params.warmup) { |
2217 | 2304 | LOG("warming up the model with an empty run\n");
|
2218 | 2305 |
|
2219 | 2306 | std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
|
0 commit comments